# Load the fitness measurements and drop every row that has a missing value.
data <- fread("fitness_data.csv")
d <- na.omit(data)
sub_IDs <- sort(unique(d$subID))  # all subject ids, ascending

# The plot below addresses columns 12-14 by position and starts on the first
# subject, so fail early if either assumption does not hold.
stopifnot(length(sub_IDs) > 0, ncol(d) >= 14)

# Measurement columns that are referenced by their header text. The same text
# is used as the trace name and as the label of the trace-selection button.
named_columns <- c(
  "Body Temperature (Celsius)",
  "heartrate during activity (bpm)",
  "Acceleration (x axis) in m/s2",
  "Acceleration (y axis) in m/s2",
  "Acceleration (z axis) in m/s2",
  "Gyroscope (x axis) in rad/s",
  "Gyroscope (y axis) in rad/s",
  "Gyroscope (z axis) in rad/s"
)

# The magnetometer columns are picked by position (12, 13, 14).
# NOTE(review): presumably their headers match the labels below -- confirm
# against the CSV before switching these to name-based access.
trace_columns <- c(named_columns, names(d)[12:14])
trace_labels <- c(
  named_columns,
  "Magnetometer (x axis) in μT",
  "Magnetometer (y axis) in μT",
  "Magnetometer (z axis) in μT"
)
n_traces <- length(trace_columns)

# Base plot: shared x axis plus a filter transform that keeps only the rows of
# one subject. Every trace added below inherits both.
graph <- plot_ly(
  d,
  x = ~`timestamp (in seconds)`,
  transforms = list(
    list(
      type = "filter",
      target = ~subID,
      operation = "=",
      value = sub_IDs[1]
    )
  )
)

# One scatter trace per measurement; only the first one is visible initially.
# `d[[col]]` yields a plain vector (a `d[, 12]` slice of a data.table would be
# a one-column table), and the explicit `type` avoids plotly's
# "No trace type specified" message for every trace.
for (i in seq_len(n_traces)) {
  graph <- add_trace(
    graph,
    y = d[[trace_columns[i]]],
    name = trace_labels[i],
    type = "scatter",
    mode = "lines+markers",
    visible = i == 1
  )
}

# Dropdown 1: one button per subject; each rewrites the filter value.
subject_buttons <- lapply(sub_IDs, function(id) {
  list(
    method = "restyle",
    args = list("transforms[0].value", id),
    label = as.character(id)
  )
})

# Dropdown 2: one button per measurement; each shows exactly one trace.
trace_buttons <- lapply(seq_len(n_traces), function(i) {
  list(
    method = "restyle",
    args = list("visible", as.list(seq_len(n_traces) == i)),
    label = trace_labels[i]
  )
})

# NOTE(review): the range selector / range slider settings are kept exactly as
# they were; the first selector button has no `step` -- confirm it behaves as
# intended on a numeric (seconds) axis.
graph <- layout(
  graph,
  title = "Fitness data",
  xaxis = list(
    title = "x(timestamp in seconds)",
    rangeselector = list(
      buttons = list(
        list(count = 3, stepmode = "backward"),
        list(step = "all")
      )
    ),
    rangeslider = list(type = "time")
  ),
  yaxis = list(title = "y"),
  updatemenus = list(
    list(
      type = "dropdown",
      active = 0,
      x = 1.3, y = 0.9,
      buttons = subject_buttons
    ),
    list(
      x = 1.5, y = 0.75,
      buttons = trace_buttons
    )
  )
)

graph
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter

Insights gleaned from the plot: 1. For most of the people, body temperature first increases and then decreases with time. 2. The variation in all the other parameters appears random.

Question 2

What is undersampling and oversampling? Consider the dataset subject.csv. Is there a case of undersampling or oversampling? If so, mention a technique to remedy the problem. Justify your answer.

#Source: https://zyxo.wordpress.com/2008/12/30/oversampling-or-undersampling/

# Load the subject table from disk, then echo it to the report.
subject_df <- read.csv(file = "subject.csv")

print(subject_df)
##   Subject.ID    Sex Age..years. Height..cm. Weight..kg. Resting.HR.BPM.
## 1        101   Male          27         182          83              70
## 2        102 Female          25         169          75              69
## 3        103   Male          31         187          78              60
## 4        104   Male          24         194          74              86
## 5        105   Male          26         180          92              60
## 6        106   Male          26         183          68              87
## 7        107   Male          23         173          95              66
## 8        108   Male          32         179          58              65
## 9        109   Male          31         168          73              54
##   Max.HR..bpm. Dominant.hand
## 1          193         right
## 2          195         right
## 3          189         right
## 4          196         right
## 5          194         right
## 6          194         right
## 7          197         right
## 8          188          left
## 9          189         right
#Checking for undersampling or oversampling in the csv
#We find that the current data is highly imbalanced. 

# Count the rows per sex (the length of any column within each group gives the
# group size) and draw the counts as a bar chart.
subject_aggregate_df <- aggregate(
  x = subject_df$Dominant.hand,
  by = list(Sex = subject_df$Sex),
  FUN = length
)
barplot(
  height = subject_aggregate_df$x,
  names.arg = subject_aggregate_df$Sex,
  ylab = "Population"
)

#The remedy to the problem is either to oversample the Female population or under sample the males
#Oversampling the females seems to be a better approach

Answer 2: The remedy to the problem is either to oversample the female population or to undersample the males. Oversampling the females is the better approach.

Question 3

There are various techniques for sampling data. Suggest a sampling technique that you think is ideal for the data in fitness_data.csv, and justify your choice.

# Read the two tables that the sampling discussion below refers to.
fitness_df <- read.csv(file = "fitness_data.csv")
activities_df <- read.csv(file = "activities.csv")

#Through these datasets, we notice that there exists only one non-medical field recorded, which is activity ID.
#There are a total of 17 activities being performed, and hence Cluster sampling technique can be utilised to sample the subjects.

Answer 3: Through these datasets, we notice that there exists only one non-medical field recorded, which is activity ID. There are a total of 17 activities being performed, and hence Cluster sampling technique can be utilised to sample the subjects.

Question 4

In August 2018, the Election Commission of India made the Lok Sabha 2014 data (Lok Sabha-2014 data.csv) public so that analysts could use it for the 2019 Lok Sabha election. Provide a suitable visualisation that accounts for the distribution of votes across the country.

# Google Maps set-up. An internet connection is required to run this.
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable
# (e.g. set in ~/.Renviron) instead of being written into the document, so it
# cannot leak through the source or the rendered report.
# SECURITY NOTE(review): a literal key was previously committed on this line;
# it should be treated as compromised and revoked in the Google Cloud console.
google_api_key <- Sys.getenv("GOOGLE_MAPS_API_KEY")
if (!nzchar(google_api_key)) {
  stop(
    "Set the GOOGLE_MAPS_API_KEY environment variable before running this.",
    call. = FALSE
  )
}
register_google(key = google_api_key)

# Base map of India, fetched once and reused by the plotting code below.
map <- get_map(location = "India", zoom = 5)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx-nYr9b5zNs2FW8jbGluJe0
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=India&key=xxx-nYr9b5zNs2FW8jbGluJe0
# Load the 2014 Lok Sabha results from the CSV file.
election_df <- read.csv(file = "Lok Sabha-2014 data.csv")
# print(election_df)
#Q4 SHINY APP
#We are visualising the spread of voters for each party, as it tells us how much prevalence or success a party has had in a certain region.
#The circles on the map locate the region, and the size of each circle scales with the margin of votes by which the seat was won.
#We are using a Shiny drop-down menu to select the party.

# UI definition: a party selector in the sidebar and the map plot in the main
# panel. The page object is passed to shinyApp() directly; the legacy
# shinyUI() wrapper is not needed for that.
ui <- fluidPage(
  titlePanel("Election Results"),
  # Input selection: one entry per distinct value of the PARTY column
  sidebarPanel(
    selectInput(
      inputId = "party",
      label = "Choose a party:",
      choices = unique(election_df$PARTY)
    )
  ),
  # Output slot that the server fills under the id "whichplot"
  mainPanel(
    plotOutput("whichplot")
  )
)

# Server logic: redraws the map whenever the selected party changes.
# The function is passed to shinyApp() directly; the legacy shinyServer()
# wrapper is not needed for that.
server <- function(input, output) {

  # Fill in the "whichplot" slot declared in the UI
  output$whichplot <- renderPlot({
    # Wait until a party has actually been selected.
    req(input$party)

    # Rows of the selected party. which() skips rows whose PARTY is NA,
    # matching what subset() did here before.
    party_rows <- election_df[
      which(election_df$PARTY == input$party), ,
      drop = FALSE
    ]

    # One semi-transparent point per row, sized by the MARGIN column, drawn
    # over the base map held in `map`.
    ggmap(map) +
      geom_point(
        aes(x = longitude, y = latitude, size = MARGIN),
        data = party_rows,
        alpha = 0.4,
        color = "orange"
      ) +
      scale_size_area(name = "MARGIN WIN")
  })
}
# Combine the UI and server definitions into a runnable Shiny application.
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents

Question-5

Many good Bollywood movies were released in 2019, one of them being Kabir Singh. The file tweets.txt contains what people have tweeted about this movie. Provide a suitable visualization that depicts the general sentiment of the audience.

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(wordcloud)
library(reshape2)
library(dplyr)
library(tidytext)

filePath <- "tweets.txt"

# Read one tweet per line and declare the input as UTF-8.
# NOTE(review): the previously rendered word counts were topped by the tokens
# "â" and "ã", which looks like UTF-8 text decoded with a single-byte
# encoding -- confirm that tweets.txt really is UTF-8.
text <- readLines(filePath, encoding = "UTF-8")

# One row per input line. The line index is derived from the data instead of
# a fixed row count, so a file of any length works.
text_df <- tibble(line = seq_along(text), text = text)

# Split every line into one lower-cased word token per row.
text_df <- text_df %>%
  unnest_tokens(word, text)

# Most frequent tokens overall.
text_df %>%
  count(word, sort = TRUE)
## # A tibble: 19,761 x 2
##    word             n
##    <chr>        <int>
##  1 â            15272
##  2 kabirsingh    7666
##  3 ã             7639
##  4 the           5506
##  5 a             4659
##  6 shahidkapoor  4188
##  7 kabir         4116
##  8 is            4074
##  9 singh         3862
## 10 of            3710
## # … with 19,751 more rows
# Words that the NRC lexicon tags with the "joy" sentiment.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Most frequent "joy" words in the tweets. The join key is stated explicitly
# so the join does not depend on which column names the two tables happen to
# share.
text_df %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 291 x 2
##    word            n
##    <chr>       <int>
##  1 love          912
##  2 good          529
##  3 brilliant     206
##  4 happy         197
##  5 outstanding   137
##  6 beautiful     123
##  7 finally       114
##  8 star          111
##  9 success       109
## 10 music         105
## # … with 281 more rows
# Tighten the plot margins for the word cloud. par() returns the previous
# values of the settings it changes, so they can be restored afterwards.
old_par <- par(mar = c(0, 1.5, 0.5, 0.5), mgp = c(10, 1, 0))

# Comparison cloud of the tweet words found in the Bing lexicon: count each
# (word, sentiment) pair, reshape to a word-by-sentiment matrix of counts
# (0 where a word lacks a sentiment), and plot at most 100 words.
# NOTE(review): the colours are matched to the matrix columns in order --
# presumably "negative" then "positive"; confirm red/blue map as intended.
text_df %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("red", "blue"),
    max.words = 100
  )

# Put the graphics parameters back so later plots are unaffected.
par(old_par)
## Joining, by = "word"